import pandas as pd
import plotly.express as px
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3 as sq3
import pandas as pd
sns.set()
con = sq3.connect("opioid.db")
annual = pd.read_sql_query("SELECT * from annual", con)
# you have to close the connection
con.close
annual.head
<bound method NDFrame.head of BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips 0 1 ABBEVILLE SC 2006 877 363620 45001 1 2 ABBEVILLE SC 2007 908 402940 45001 2 3 ABBEVILLE SC 2008 871 424590 45001 3 4 ABBEVILLE SC 2009 930 467230 45001 4 5 ABBEVILLE SC 2010 1197 539280 45001 ... ... ... ... ... ... ... ... 27753 27754 NA NV 2007 447 200600 NA 27754 27755 NA NV 2008 5 2200 NA 27755 27756 NA OH 2006 23 5100 NA 27756 27757 NA PR 2006 10 17800 NA 27757 27758 NA PR 2007 2 1300 NA [27758 rows x 7 columns]>
annual.loc[(annual.BUYER_STATE == 'AR') & (annual.BUYER_COUNTY == 'MONTGOMERY'), 'countyfips'] = 5097
annual
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | ||
|---|---|---|---|---|---|---|---|
| 0 | 1 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
| 1 | 2 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
| 2 | 3 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
| 3 | 4 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
| 4 | 5 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 27753 | 27754 | NA | NV | 2007 | 447 | 200600 | NA |
| 27754 | 27755 | NA | NV | 2008 | 5 | 2200 | NA |
| 27755 | 27756 | NA | OH | 2006 | 23 | 5100 | NA |
| 27756 | 27757 | NA | PR | 2006 | 10 | 17800 | NA |
| 27757 | 27758 | NA | PR | 2007 | 2 | 1300 | NA |
27758 rows × 7 columns
annual.loc[(annual.BUYER_COUNTY == 'NA')]
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | ||
|---|---|---|---|---|---|---|---|
| 27741 | 27742 | NA | AE | 2006 | 2 | 330 | NA |
| 27742 | 27743 | NA | CA | 2006 | 47 | 12600 | NA |
| 27743 | 27744 | NA | CT | 2006 | 305 | 78700 | NA |
| 27744 | 27745 | NA | CT | 2007 | 112 | 30900 | NA |
| 27745 | 27746 | NA | CT | 2008 | 48 | 15000 | NA |
| 27746 | 27747 | NA | FL | 2006 | 9 | 900 | NA |
| 27747 | 27748 | NA | FL | 2007 | 7 | 700 | NA |
| 27748 | 27749 | NA | GA | 2006 | 114 | 51700 | NA |
| 27749 | 27750 | NA | IA | 2006 | 7 | 2300 | NA |
| 27750 | 27751 | NA | IN | 2006 | 292 | 39300 | NA |
| 27751 | 27752 | NA | MA | 2006 | 247 | 114900 | NA |
| 27752 | 27753 | NA | NV | 2006 | 380 | 173600 | NA |
| 27753 | 27754 | NA | NV | 2007 | 447 | 200600 | NA |
| 27754 | 27755 | NA | NV | 2008 | 5 | 2200 | NA |
| 27755 | 27756 | NA | OH | 2006 | 23 | 5100 | NA |
| 27756 | 27757 | NA | PR | 2006 | 10 | 17800 | NA |
| 27757 | 27758 | NA | PR | 2007 | 2 | 1300 | NA |
annual.drop(range(27741, 27758), axis=0)
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | ||
|---|---|---|---|---|---|---|---|
| 0 | 1 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
| 1 | 2 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
| 2 | 3 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
| 3 | 4 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
| 4 | 5 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 27736 | 27737 | ZAVALA | TX | 2010 | 248 | 200100 | 48507 |
| 27737 | 27738 | ZAVALA | TX | 2011 | 406 | 244800 | 48507 |
| 27738 | 27739 | ZAVALA | TX | 2012 | 473 | 263700 | 48507 |
| 27739 | 27740 | ZAVALA | TX | 2013 | 399 | 186700 | 48507 |
| 27740 | 27741 | ZAVALA | TX | 2014 | 162 | 148930 | 48507 |
27741 rows × 7 columns
annual = annual.assign(pills = pd.to_numeric((annual.DOSAGE_UNIT))/1000000)
annual_year = annual
annual_year = annual_year.groupby(['year']).pills.mean().reset_index().rename(columns = {'pills' : 'average_pills'})
fig = px.scatter(annual_year, x = "year", y='average_pills')
fig.show()